Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_pages

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************


class WikipediaPages(object):
    """A set of Wikipedia page objects with helpers that aggregate their fields.

    Page objects stored here are expected to be hashable and to expose
    ``orig_phrase``, ``orig_phrase_norm``, ``wiki_title``, ``wiki_title_norm``,
    a ``relations`` object and a ``toJson()`` method, since those are the
    members the methods below read.

    Despite the ``get_and_set_*`` naming, none of the aggregation methods
    modify this object or its pages; each one builds and returns a new ``set``.
    """

    def __init__(self):
        """Create an empty collection of pages."""
        self.pages = set()
        # Stays True until a page with a non-empty normalized phrase is added.
        self.is_empty_norm_phrase = True

    def get_pages(self):
        """Return the underlying set of pages (the live object, not a copy)."""
        return self.pages

    def add_page(self, page):
        """Add ``page`` to the collection.

        Clears ``is_empty_norm_phrase`` when the page carries a normalized
        original phrase that is neither ``None`` nor the empty string.
        """
        self.pages.add(page)
        if page.orig_phrase_norm is not None and page.orig_phrase_norm != '':
            self.is_empty_norm_phrase = False

    def _collect_relation_values(self, *attribute_names):
        """Union the named ``page.relations`` attributes across all pages.

        Attributes whose value is ``None`` are skipped; every other value is
        treated as an iterable of hashable items.
        """
        collected = set()
        for page in self.pages:
            relations = page.relations
            for attribute_name in attribute_names:
                values = getattr(relations, attribute_name)
                if values is not None:
                    collected.update(values)
        return collected

    def get_and_set_all_disambiguation(self):
        """Return the set of all disambiguation links (normalized and raw)."""
        return self._collect_relation_values(
            'disambiguation_links_norm', 'disambiguation_links')

    def get_and_set_all_categories(self):
        """Return the set of all categories (normalized and raw)."""
        return self._collect_relation_values('categories_norm', 'categories')

    def get_and_set_all_aliases(self):
        """Return the set of all aliases (normalized and raw)."""
        return self._collect_relation_values('aliases_norm', 'aliases')

    def get_and_set_parenthesis(self):
        """Return the set of all title-parenthesis values (normalized and raw)."""
        return self._collect_relation_values(
            'title_parenthesis_norm', 'title_parenthesis')

    def get_and_set_be_comp(self):
        """Return the set of all ``be_comp`` values (normalized and raw)."""
        return self._collect_relation_values('be_comp_norm', 'be_comp')

    def get_and_set_titles(self):
        """Return the set of original phrases and wiki titles of all pages.

        For each page, a non-empty ``orig_phrase`` contributes itself and
        ``orig_phrase_norm``; a non-empty ``wiki_title`` contributes itself
        and ``wiki_title_norm``. The normalized values are added without any
        check, so the result may contain ``None`` or ``''``.
        """
        # NOTE(review): each normalized form is guarded by the same non-empty
        # check as its raw form; the nesting was reconstructed from a
        # whitespace-collapsed listing - confirm against the upstream file.
        all_titles = set()
        for page in self.pages:
            if page.orig_phrase != '':
                all_titles.add(page.orig_phrase)
                all_titles.add(page.orig_phrase_norm)
            if page.wiki_title != '':
                all_titles.add(page.wiki_title)
                all_titles.add(page.wiki_title_norm)
        return all_titles

    def toJson(self):
        """Return ``{'pages': [...]}`` holding each page's ``toJson()`` result.

        The list order follows set iteration order and is therefore arbitrary.
        """
        return {'pages': [page.toJson() for page in self.pages]}

    def __str__(self) -> str:
        """Return the pages' string forms joined by ``', '``.

        Joining avoids the dangling trailing comma that appending a separator
        after every page and stripping whitespace would leave behind.
        """
        return ', '.join(str(page) for page in self.pages)